clear

use 	".\input\city_mortality_CLS\EHB_dataset.dta"

tab 	year
gen 	counter = 1 if year<=1918

bys citycode: egen yrs_lteq1918 = count(counter*mort_tot)
tab		yrs_lteq1918

keep if yrs_lteq1918==4 | ctname=="DALLAS" | ctname=="DES MOINES"

codebook citycode // 492 total cities

bys citycode: egen yrs_all = count(mort_tot)
tab		yrs_all

gen		sample_4yrs = 1
gen		sample_11yrs = (yrs_all==11)

keep sample_* year citycode ctname county state stname stfips sept14 sept1421 sept2128 sept28oct5 oct5 point_x point_y mort_tot

merge m:1 ctname stname using  "./output/intermediate/pop_main"
drop if _merge!=3
drop 	_merge

** Collapse NYC components **
*preserve
*	keep if stname=="New York" & (ctname=="BROOKLYN" | ctname=="MANHATTAN" | ctname=="QUEENS" | ctname=="BRONX" | ctname=="RICHMOND")
*	collapse (sum) mort_tot pop_* (mean) sample_*, by(year)

*******************

gen		pop_i = . 
foreach y of numlist 1915/1919 {
	replace pop_i = (`y'-1910)*(pop_1920-pop_1910)/10 + pop_1910 if year==`y'
} 
replace pop_i = pop_1920 if year==1920
foreach y of numlist 1921/1925 {
	replace pop_i = (`y'-1920)*(pop_1930-pop_1920)/10 + pop_1920 if year==`y'
} 
replace pop_i = round(pop_i)

sort 	citycode year

** Check data quality/consistency **
gen		flag = (mort_tot > 0.1*pop_i)

browse if flag==1 & !mi(mort_tot)

drop if citycode==252 | citycode==2036 | citycode==2037 // Deleting Plattsburg NY, Richmond NY, and S Bethlehem PA; inconsistent mortality and pop data 

*=========================*
** MAKE EXCESS MORTALITY **

gen		mort_rate = mort_tot/(pop_i/100000)

** First model: 1918 vs ave 1915-1917 **

xtset citycode year

gen		mort_excess1 = max(0, mort_rate - ((L.mort_rate + L2.mort_rate + L3.mort_rate)/3)) if year==1918
sum 	mort_excess1, d

browse if mort_excess>1500 & !mi(mort_excess)

** Second model: in levels, predicted 1918 value if sample_11yrs==1 **

gen		mort_excess2_level = .

levelsof citycode if sample_11yrs==1, local(cities11)

quietly {
foreach l of local cities11 {
	reg 	mort_tot year if year!=1918 & citycode==`l'
	predict mort_`l' if citycode==`l', xb
	replace mort_excess2_level = max(0, mort_tot-mort_`l') if year==1918 & citycode==`l'
	drop	mort_`l'
}
}

gen		mort_excess2 = mort_excess2_level/(pop_i/100000)
drop 	mort_excess2_level

** Last model: in rates, predicted 1918 value if sample_11yrs==1 **

gen		mort_excess3 = .

levelsof citycode if sample_11yrs==1, local(cities11)

quietly {
foreach l of local cities11 {
	reg 	mort_rate year if year!=1918 & citycode==`l'
	predict mort_`l' if citycode==`l', xb
	replace mort_excess3 = max(0, mort_rate-mort_`l') if year==1918 & citycode==`l'
	drop	mort_`l'
}
}

pwcorr mort_excess?
pwcorr mort_excess? if !mi(mort_excess1) & !mi(mort_excess2) & !mi(mort_excess3)

spearman mort_excess?
spearman mort_excess? if !mi(mort_excess1) & !mi(mort_excess2) & !mi(mort_excess3)

lab var mort_excess1 "Excess Mortality Rate, comparing 1918 to 1915-17"
lab var mort_excess2 "Excess Mortality Rate, levels predicted leaving out 1918"
lab var mort_excess3 "Excess Mortality Rate, rates predicted leaving out 1918"

keep if year==1918 | (year==1919 & ctname=="DES MOINES")
drop 	flag year

*=========================*
** Match to CLS ids 	 **

preserve
	clear
	import delim "./input/city_mortality_CLS/cls_cities_geotagged.csv"
	tempfile clsids
	save	"`clsids'", replace
restore

replace ctname = lower(ctname)
replace stname = lower(stname)

merge 1:1 ctname stname using "`clsids'", keepusing(cls_id npi_id extra_id)
drop if _merge!=3 & (ctname!="dallas" & ctname!= "des moines")
drop	_merge

compress 
save 	"./output/intermediate/cls_excessmortality_exp", replace

clear
